Cleaning and Features

Cleaning

Let's load in the data and check the 81 variables that we have.

# Read the train/test CSVs with data.table::fread; stringsAsFactors = TRUE
# converts character columns to factors. Coerce each to a plain data.frame.
train <- data.frame(fread('train.csv', stringsAsFactors = TRUE))
test <- data.frame(fread('test.csv', stringsAsFactors = TRUE))
names(train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
# sapply(train, class) would list each column's class on its own;
# str() shows the same information plus the first few values.
#sapply(train, class)

str(train)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 16 12 22 15 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","CWD","Con",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

Missing values

It is important to make it a practice to check for missing values. If a particular variable has too many missing values, we may have to remove it.

# Count the missing values per column. colSums(is.na(.)) is the vectorised
# idiom for the per-column length(which(is.na(x) == TRUE)) pattern below.
#Num_NA<-sapply(train,function(x)length(which(is.na(x)==T)))
Num_NA <- colSums(is.na(train))
# Table of column names and their NA counts, most-missing first.
missing.df <- data.frame(colnames(train), Num_NA)
missing.df <- missing.df[order(missing.df$Num_NA, decreasing = TRUE), ]
head(missing.df, 10)
##              colnames.train. Num_NA
## PoolQC                PoolQC   1453
## MiscFeature      MiscFeature   1406
## Alley                  Alley   1369
## Fence                  Fence   1179
## FireplaceQu      FireplaceQu    690
## LotFrontage      LotFrontage    259
## GarageType        GarageType     81
## GarageYrBlt      GarageYrBlt     81
## GarageFinish    GarageFinish     81
## GarageQual        GarageQual     81

We can see the variables with the most NAs. The four worst offenders are almost entirely missing, so let's remove:

  • poolqc
  • Miscfeature
  • alley
  • fence
# Drop the four variables dominated by missing values from both data sets.
cols.to.drop <- c("PoolQC", "MiscFeature", "Alley", "Fence")
for (col in cols.to.drop) {
  train[[col]] <- NULL
  test[[col]] <- NULL
}

Features

# Keep a copy of the originally numeric columns for the later correlation
# analysis (taken before the factors are recoded below).
all.numeric.variables.index <- sapply(train, is.numeric)
all.numeric.variables <- train[, all.numeric.variables.index]

# Recode every factor column as its integer level codes.
# seq_along() replaces 1:length(colnames(train)): it is the idiomatic,
# zero-length-safe way to iterate over columns.
for (i in seq_along(train)) {
  if (is.factor(train[, i])) {
    train[, i] <- as.integer(train[, i])
  }
}


# lets check
sapply(train, class)
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    Fireplaces   FireplaceQu    GarageType   GarageYrBlt  GarageFinish 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    GarageCars    GarageArea    GarageQual    GarageCond    PavedDrive 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##      PoolArea       MiscVal        MoSold        YrSold      SaleType 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## SaleCondition     SalePrice 
##     "integer"     "integer"
# Apply the same factor-to-integer recoding to the test set for later use.
# seq_along() replaces the 1:length(colnames(test)) anti-pattern.
for (i in seq_along(test)) {
  if (is.factor(test[, i])) {
    test[, i] <- as.integer(test[, i])
  }
}

Now, we will fill all the remaining NAs with 0.

# Impute every remaining NA with 0.
# NOTE(review): for the integer-coded factors this makes "missing" a value
# below every real level code — confirm that is the intended encoding.
train[is.na(train)] <- 0
all.numeric.variables[is.na(all.numeric.variables)] <- 0

# do the same to test
test[is.na(test)] <- 0 

Analysis

Focus the initial analysis on the numeric variables (not the ones we turned from factors to numericals). Since we do not need the Id column for the correlation, we can remove it.

# Correlation matrix of the original numeric columns, with Id (column 1)
# excluded, rendered as a clustered lower-triangle heat map.
correlation.df <- cor(all.numeric.variables[, -1])
corrplot(
  correlation.df,
  method = "color", type = "lower", order = "hclust",
  tl.cex = 0.5, sig.level = 0.05, insig = "blank",
  addCoef.col = "black", number.cex = 0.35, diag = FALSE
)

Seems like we have some variables that have strong relationship with each other. The suspects are:

# pairs(~SalePrice+OverallQual+TotalBsmtSF+X1stFlrSF+GarageArea+GarageCars+GrLivArea, data = train)
# is the base-graphics equivalent of the caret featurePlot below.
suspect.vars <- c("OverallQual", "TotalBsmtSF", "X1stFlrSF",
                  "GarageArea", "GarageCars", "GrLivArea")
featurePlot(
  x = train[, suspect.vars],
  y = train[, "SalePrice"],
  plot = "pairs",
  auto.key = list(columns = 3)  # legend key at the top
)

It is very obvious that multicollinearity exists among these variables.

Sales Price analysis

# Scatter of SalePrice against YearBuilt, overlaid with a smoothed line
# through the fitted values of a simple linear regression.
trend.line <- fitted(lm(train$SalePrice ~ train$YearBuilt))
plot_ly(all.numeric.variables, x = ~YearBuilt, y = ~SalePrice,
        type = 'scatter', mode = 'markers', name = "scatter") %>%
  add_trace(y = trend.line, mode = 'lines',
            line = list(shape = "spline", smoothing = 1), name = 'smooth')
# ggplot(train, aes(x = YearBuilt, y = SalePrice)) + geom_point() + geom_smooth()

There is a general upward trend: the newer the house, the higher the price level of the house.

Modelling

Let's do a partition using the caret package to prepare the train set, before we go into the modelling aspect.

# NOTE(review): `random` is not defined in this chunk — presumably an
# integer seed set earlier in the document; confirm before re-running.
set.seed(random)
outcome <- train$SalePrice
# Stratified ~60/40 split on the outcome via caret.
partition <- createDataPartition(y= outcome, p= 0.599, list =F)
# split train into 2: 6:4 
training_split <- train[partition,]
testing_split <- train[-partition,]

Linear Regression Model

Most basic model will be the regression model. We start by fitting all the variables to have a look:

# Baseline linear model: regress SalePrice on every remaining predictor.
reg.model <- lm(SalePrice ~. , data = training_split)
summary(reg.model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = training_split)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -328846  -14683    -755   13186  314877 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.104e+06  1.797e+06   2.284 0.022622 *  
## Id            -1.033e+00  2.756e+00  -0.375 0.707804    
## MSSubClass    -1.194e+02  6.388e+01  -1.869 0.061932 .  
## MSZoning      -1.068e+03  2.110e+03  -0.506 0.612991    
## LotFrontage   -5.290e+01  3.736e+01  -1.416 0.157213    
## LotArea        1.714e-01  1.485e-01   1.154 0.248812    
## Street         3.084e+04  1.647e+04   1.873 0.061491 .  
## LotShape      -1.338e+03  9.127e+02  -1.466 0.143056    
## LandContour    3.756e+03  1.780e+03   2.110 0.035161 *  
## Utilities     -5.210e+04  3.518e+04  -1.481 0.139093    
## LotConfig      1.250e+02  7.462e+02   0.168 0.867012    
## LandSlope      1.283e+04  4.965e+03   2.584 0.009950 ** 
## Neighborhood   4.458e+02  2.082e+02   2.141 0.032543 *  
## Condition1    -2.395e+02  1.246e+03  -0.192 0.847655    
## Condition2    -8.349e+03  4.530e+03  -1.843 0.065720 .  
## BldgType      -1.552e+03  2.118e+03  -0.733 0.463792    
## HouseStyle    -2.085e+03  8.670e+02  -2.404 0.016431 *  
## OverallQual    1.286e+04  1.589e+03   8.096 2.11e-15 ***
## OverallCond    4.395e+03  1.416e+03   3.103 0.001981 ** 
## YearBuilt      2.188e+02  1.029e+02   2.126 0.033801 *  
## YearRemodAdd   6.795e+01  9.051e+01   0.751 0.453033    
## RoofStyle      2.996e+03  1.550e+03   1.933 0.053629 .  
## RoofMatl       1.987e+03  1.842e+03   1.078 0.281175    
## Exterior1st    5.157e+02  7.125e+02   0.724 0.469441    
## Exterior2nd   -6.296e+02  6.454e+02  -0.976 0.329565    
## MasVnrType     2.534e+03  2.036e+03   1.245 0.213457    
## MasVnrArea     2.929e+01  8.607e+00   3.403 0.000700 ***
## ExterQual     -4.693e+03  2.735e+03  -1.716 0.086540 .  
## ExterCond     -3.919e+02  1.648e+03  -0.238 0.812125    
## Foundation     1.408e+02  2.309e+03   0.061 0.951416    
## BsmtQual      -7.762e+03  1.882e+03  -4.123 4.12e-05 ***
## BsmtCond       5.102e+03  1.778e+03   2.869 0.004226 ** 
## BsmtExposure  -3.024e+03  1.197e+03  -2.526 0.011737 *  
## BsmtFinType1  -9.580e+02  8.149e+02  -1.176 0.240098    
## BsmtFinSF1     1.306e+00  7.168e+00   0.182 0.855425    
## BsmtFinType2   3.522e+03  1.580e+03   2.229 0.026105 *  
## BsmtFinSF2     1.626e+01  1.062e+01   1.532 0.125965    
## BsmtUnfSF     -1.974e+00  6.935e+00  -0.285 0.775999    
## TotalBsmtSF           NA         NA      NA       NA    
## Heating       -6.193e+02  4.598e+03  -0.135 0.892902    
## HeatingQC     -8.036e+02  8.305e+02  -0.968 0.333493    
## CentralAir    -2.406e+02  6.153e+03  -0.039 0.968815    
## Electrical    -5.725e+02  1.181e+03  -0.485 0.628115    
## X1stFlrSF      3.724e+01  8.376e+00   4.447 9.96e-06 ***
## X2ndFlrSF      4.730e+01  6.516e+00   7.258 9.27e-13 ***
## LowQualFinSF   2.529e+01  2.391e+01   1.058 0.290359    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   1.196e+04  3.268e+03   3.661 0.000268 ***
## BsmtHalfBath   3.955e+03  5.142e+03   0.769 0.442102    
## FullBath       2.266e+03  3.535e+03   0.641 0.521664    
## HalfBath      -3.634e+03  3.454e+03  -1.052 0.293030    
## BedroomAbvGr  -1.373e+03  2.289e+03  -0.600 0.548854    
## KitchenAbvGr  -1.414e+04  7.317e+03  -1.932 0.053678 .  
## KitchenQual   -8.758e+03  1.958e+03  -4.472 8.86e-06 ***
## TotRmsAbvGrd   1.811e+03  1.618e+03   1.119 0.263438    
## Functional     1.984e+03  1.460e+03   1.359 0.174437    
## Fireplaces     6.698e+03  3.629e+03   1.846 0.065298 .  
## FireplaceQu   -8.139e+02  1.085e+03  -0.750 0.453408    
## GarageType    -5.351e+02  8.633e+02  -0.620 0.535529    
## GarageYrBlt   -2.520e+00  8.547e+00  -0.295 0.768178    
## GarageFinish  -1.827e+03  1.953e+03  -0.936 0.349686    
## GarageCars     1.759e+04  3.987e+03   4.413 1.16e-05 ***
## GarageArea    -3.241e+00  1.303e+01  -0.249 0.803639    
## GarageQual    -2.246e+03  2.287e+03  -0.982 0.326487    
## GarageCond     1.208e+03  2.781e+03   0.434 0.664208    
## PavedDrive     2.855e+03  2.843e+03   1.004 0.315676    
## WoodDeckSF     1.192e+01  1.003e+01   1.189 0.234621    
## OpenPorchSF   -1.921e+01  1.962e+01  -0.979 0.327813    
## EnclosedPorch  1.874e+01  2.230e+01   0.840 0.401072    
## X3SsnPorch     3.852e+01  3.899e+01   0.988 0.323447    
## ScreenPorch    6.634e+01  2.324e+01   2.855 0.004418 ** 
## PoolArea      -3.896e+01  3.934e+01  -0.990 0.322278    
## MiscVal       -2.283e+00  3.716e+00  -0.614 0.539064    
## MoSold         2.072e+02  4.397e+02   0.471 0.637564    
## YrSold        -2.322e+03  8.841e+02  -2.626 0.008803 ** 
## SaleType      -9.708e+01  7.772e+02  -0.125 0.900625    
## SaleCondition  2.109e+03  1.066e+03   1.979 0.048162 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32540 on 801 degrees of freedom
## Multiple R-squared:  0.8318, Adjusted R-squared:  0.8162 
## F-statistic: 53.52 on 74 and 801 DF,  p-value: < 2.2e-16

Very good R-squared value, which says that the explanatory variables are explaining SalePrice well. At the same time, there are many variables which have no effect on SalePrice. We can now implement the stepwise algorithm for variable selection, based on AIC values.

# Reduced model: the predictor set below is the result of stepwise
# AIC-based variable selection applied to the full model reg.model.
reg.model.2 <- lm(formula = SalePrice ~ OverallQual + GrLivArea + YearBuilt + 
    KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea + 
    ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st + 
    BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea + 
    YrSold + ScreenPorch + Street + SaleCondition + GarageFinish + 
    BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd + 
    LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood + 
    KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure + 
    Condition1, data = training_split)

summary(reg.model.2)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + YearBuilt + 
##     KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea + 
##     ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st + 
##     BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea + 
##     YrSold + ScreenPorch + Street + SaleCondition + GarageFinish + 
##     BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd + 
##     LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood + 
##     KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure + 
##     Condition1, data = training_split)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -352046  -15169   -1050   13586  328482 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.116e+06  1.736e+06   2.371 0.017965 *  
## OverallQual    1.333e+04  1.538e+03   8.666  < 2e-16 ***
## GrLivArea      4.197e+01  5.253e+00   7.989 4.49e-15 ***
## YearBuilt      2.076e+02  6.923e+01   2.999 0.002790 ** 
## KitchenQual   -9.041e+03  1.884e+03  -4.798 1.89e-06 ***
## BsmtFullBath   1.189e+04  2.585e+03   4.601 4.85e-06 ***
## MSSubClass    -1.896e+02  3.318e+01  -5.715 1.52e-08 ***
## GarageCars     1.463e+04  2.305e+03   6.349 3.55e-10 ***
## LotArea        2.425e-01  1.389e-01   1.746 0.081189 .  
## ExterQual     -5.598e+03  2.595e+03  -2.157 0.031304 *  
## OverallCond    4.963e+03  1.186e+03   4.184 3.17e-05 ***
## PoolArea      -5.150e+01  3.586e+01  -1.436 0.151375    
## WoodDeckSF     1.707e+01  9.681e+00   1.763 0.078218 .  
## Exterior1st    6.099e+02  6.933e+02   0.880 0.379278    
## BsmtQual      -7.792e+03  1.789e+03  -4.355 1.50e-05 ***
## BsmtCond       4.816e+03  1.667e+03   2.889 0.003967 ** 
## Fireplaces     4.474e+03  2.164e+03   2.068 0.038976 *  
## MasVnrType     2.631e+03  1.954e+03   1.347 0.178465    
## MasVnrArea     2.860e+01  8.065e+00   3.546 0.000413 ***
## YrSold        -2.288e+03  8.583e+02  -2.666 0.007819 ** 
## ScreenPorch    6.559e+01  2.230e+01   2.941 0.003361 ** 
## Street         3.382e+04  1.598e+04   2.116 0.034623 *  
## SaleCondition  2.037e+03  1.009e+03   2.019 0.043812 *  
## GarageFinish  -3.186e+03  1.441e+03  -2.211 0.027283 *  
## BsmtFinType2   3.374e+03  1.455e+03   2.318 0.020690 *  
## BsmtFinType1  -1.433e+03  7.154e+02  -2.004 0.045410 *  
## BsmtFinSF2     1.235e+01  9.302e+00   1.327 0.184826    
## TotRmsAbvGrd   1.319e+03  1.568e+03   0.841 0.400514    
## LandSlope      1.138e+04  4.578e+03   2.485 0.013166 *  
## EnclosedPorch  1.486e+01  2.147e+01   0.692 0.489055    
## Functional     1.857e+03  1.413e+03   1.314 0.189233    
## BedroomAbvGr  -1.790e+03  2.163e+03  -0.827 0.408276    
## Neighborhood   4.202e+02  1.990e+02   2.111 0.035064 *  
## KitchenAbvGr  -1.425e+04  6.819e+03  -2.090 0.036923 *  
## X1stFlrSF     -3.801e+00  4.516e+00  -0.842 0.400242    
## FullBath       3.971e+03  3.106e+03   1.278 0.201460    
## Exterior2nd   -6.115e+02  6.308e+02  -0.969 0.332651    
## BsmtExposure  -2.752e+03  1.134e+03  -2.426 0.015457 *  
## Condition1    -1.838e+02  1.190e+03  -0.154 0.877311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32620 on 837 degrees of freedom
## Multiple R-squared:  0.8233, Adjusted R-squared:  0.8153 
## F-statistic: 102.7 on 38 and 837 DF,  p-value: < 2.2e-16

Now, the model looks very good, and all the variables have an effect on SalePrice. Let us check the RMSE metric defined by Kaggle.

We do this with an out-of-sample comparison: we obtain predictions by applying our model to the testing_split data set and compare them with the actual testing_split values.

Some residual diagnostics: checking for heteroskedasticity

We can plot residuals vs fitted values to see if there is any pattern: if there is, heteroskedasticity is present.

In fact, we can run a formal test to check:

# Breusch-Pagan test: a small p-value indicates heteroskedasticity.
bptest(reg.model.2)
## 
##  studentized Breusch-Pagan test
## 
## data:  reg.model.2
## BP = 471.76, df = 38, p-value < 2.2e-16
# Refit the reduced model with rms::ols (x = TRUE stores the design
# matrix, which robcov needs), then use robcov() for a Huber-White
# sandwich covariance estimate, robust to the heteroskedasticity
# detected by the Breusch-Pagan test above.
# NOTE(review): `Id` is added here although it was not in reg.model.2 —
# confirm it is intentionally part of the corrected model.
model <- ols(SalePrice ~ OverallQual + GrLivArea + YearBuilt + 
    KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea + 
    ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st + 
    BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea + 
    YrSold + ScreenPorch + Street + SaleCondition + GarageFinish + 
    BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd + 
    LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood + 
    KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure + 
    Condition1 + Id, data = training_split, x = TRUE)
reg.model.2.corrected <- robcov(model)
reg.model.2.corrected
## Linear Regression Model
##  
##  ols(formula = SalePrice ~ OverallQual + GrLivArea + YearBuilt + 
##      KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea + 
##      ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st + 
##      BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea + 
##      YrSold + ScreenPorch + Street + SaleCondition + GarageFinish + 
##      BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd + 
##      LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood + 
##      KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure + 
##      Condition1 + Id, data = training_split, x = TRUE)
##  
##                      Model Likelihood     Discrimination    
##                         Ratio Test           Indexes        
##  Obs         876    LR chi2    1518.96    R2       0.823    
##  sigma32631.8551    d.f.            39    R2 adj   0.815    
##  d.f.        836    Pr(> chi2)  0.0000    g    75940.692    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -351535  -15200   -1117   13251  328446 
##  
##  
##                Coef         S.E.         t     Pr(>|t|)
##  Intercept     4112810.6849 1648810.4804  2.49 0.0128  
##  OverallQual     13251.3602    2072.8919  6.39 <0.0001 
##  GrLivArea          41.9887      14.1715  2.96 0.0031  
##  YearBuilt         207.9856      92.3151  2.25 0.0245  
##  KitchenQual     -9049.4636    2158.2764 -4.19 <0.0001 
##  BsmtFullBath    11842.4359    2222.5609  5.33 <0.0001 
##  MSSubClass       -188.4168      40.0264 -4.71 <0.0001 
##  GarageCars      14694.3636    3951.3366  3.72 0.0002  
##  LotArea             0.2376       0.2295  1.04 0.3009  
##  ExterQual       -5617.8979    3545.8978 -1.58 0.1135  
##  OverallCond      4965.8627    1207.0570  4.11 <0.0001 
##  PoolArea          -50.6519     220.8765 -0.23 0.8187  
##  WoodDeckSF         16.8371       9.5924  1.76 0.0796  
##  Exterior1st       627.8258     935.6123  0.67 0.5024  
##  BsmtQual        -7787.1083    1866.4897 -4.17 <0.0001 
##  BsmtCond         4803.0730    1334.5870  3.60 0.0003  
##  Fireplaces       4455.8031    2660.2048  1.67 0.0943  
##  MasVnrType       2678.2581    2972.2692  0.90 0.3678  
##  MasVnrArea         28.5386      15.3670  1.86 0.0636  
##  YrSold          -2286.8082     790.0984 -2.89 0.0039  
##  ScreenPorch        65.5507      34.8310  1.88 0.0602  
##  Street          33937.8914   16371.8040  2.07 0.0385  
##  SaleCondition    2022.3028    1220.9625  1.66 0.0980  
##  GarageFinish    -3225.6361    1387.3886 -2.32 0.0203  
##  BsmtFinType2     3416.7908    1123.5414  3.04 0.0024  
##  BsmtFinType1    -1418.2795     588.3831 -2.41 0.0161  
##  BsmtFinSF2         12.6422       7.4720  1.69 0.0910  
##  TotRmsAbvGrd     1324.9740    2363.9788  0.56 0.5753  
##  LandSlope       11357.3707    7591.7576  1.50 0.1350  
##  EnclosedPorch      14.7319      30.2178  0.49 0.6260  
##  Functional       1872.8096    1346.4032  1.39 0.1646  
##  BedroomAbvGr    -1761.1074    2927.6072 -0.60 0.5476  
##  Neighborhood      418.9487     245.1648  1.71 0.0879  
##  KitchenAbvGr   -14284.3589    5239.0428 -2.73 0.0065  
##  X1stFlrSF          -3.6696      10.2509 -0.36 0.7204  
##  FullBath         3983.8102    4672.2094  0.85 0.3941  
##  Exterior2nd      -640.4179     923.0366 -0.69 0.4880  
##  BsmtExposure    -2775.5999    1203.2649 -2.31 0.0213  
##  Condition1       -203.1715    1257.2726 -0.16 0.8717  
##  Id                 -1.7063       2.2156 -0.77 0.4414  
## 

With the correction done (it does not change the coefficients, only the standard errors and t-statistics), we can now assess the model via the Kaggle metric: RMSE on the log values.

# Out-of-sample evaluation on the held-out 40%, using the Kaggle metric:
# RMSE computed on log(price) so errors are relative rather than absolute.
prediction.reg <- predict(reg.model.2.corrected, testing_split)
rmse.reg <- data.frame(log(prediction.reg), log(testing_split$SalePrice))
rmse(rmse.reg$log.prediction.reg.,rmse.reg$log.testing_split.SalePrice.)
## [1] 0.1622399

Least absolute shrinkage and selection operator Regression

LASSO regression can help reduce multicollinearity. Similar to the xgboost function, we need to ensure the data is in matrix form.

# Build numeric matrices for lars (like xgboost, it requires matrix input).
# Select the response by NAME rather than the hard-coded positions 1:76 / 77,
# which would silently break if columns were added, removed, or reordered.
# Note: Id is kept among the predictors, as in the original positional subset.
predictor.cols <- setdiff(colnames(training_split), "SalePrice")
covariate.variable <- as.matrix(training_split[, predictor.cols])
dependent.variable <- as.matrix(training_split[["SalePrice"]])
testing.variable <- as.matrix(testing_split[, predictor.cols])
lasso <- lars(covariate.variable, dependent.variable, type = 'lasso')
plot(lasso)

To choose the model size with the least multicollinearity, we use Mallows's Cp: we take the step with the smallest Cp from the lasso fit and use that value in the prediction function.

# Step with the smallest Mallows's Cp — the least-overfit lasso solution.
optimal.step <- lasso$df[which.min(lasso$Cp)]
# NOTE(review): `s` is interpreted under predict.lars' default mode here;
# lasso$df holds the degrees of freedom at each step, used as that step's
# index — confirm it lands on the intended Cp-minimising step.
prediction.lasso <- predict.lars(lasso, newx = testing.variable, s = optimal.step, type = "fit" )

# Kaggle metric again: RMSE on log(price) for the held-out split.
rmse.lasso <- data.frame(test = log(testing_split$SalePrice), lasso = log(prediction.lasso$fit))
rmse(rmse.lasso$test,rmse.lasso$lasso)
## [1] 0.1575195

Looking good with the improvement :)

Random Forest: Regression

# Random forest regression with default settings.
# NOTE(review): no set.seed() immediately precedes this fit, so the forest
# (and the RMSE below) is not exactly reproducible — confirm if intended.
rf <- randomForest(SalePrice ~. , data = training_split)
plot(rf)  # error as a function of the number of trees

importance(rf)
##               IncNodePurity
## Id             1.954005e+10
## MSSubClass     1.298422e+10
## MSZoning       9.289012e+09
## LotFrontage    4.418033e+10
## LotArea        9.177708e+10
## Street         1.744684e+08
## LotShape       9.191824e+09
## LandContour    1.227413e+10
## Utilities      2.137381e+06
## LotConfig      6.753246e+09
## LandSlope      1.037402e+10
## Neighborhood   3.360471e+10
## Condition1     4.160271e+09
## Condition2     8.661295e+08
## BldgType       3.527341e+09
## HouseStyle     6.155201e+09
## OverallQual    1.107987e+12
## OverallCond    2.193869e+10
## YearBuilt      2.827881e+11
## YearRemodAdd   7.750877e+10
## RoofStyle      4.643791e+09
## RoofMatl       3.631695e+09
## Exterior1st    1.039458e+10
## Exterior2nd    8.763262e+09
## MasVnrType     5.215892e+09
## MasVnrArea     4.590954e+10
## ExterQual      4.246745e+11
## ExterCond      6.953708e+09
## Foundation     5.841040e+09
## BsmtQual       8.982764e+10
## BsmtCond       3.417276e+09
## BsmtExposure   7.853481e+09
## BsmtFinType1   8.545312e+09
## BsmtFinSF1     1.341033e+11
## BsmtFinType2   2.350457e+09
## BsmtFinSF2     2.780870e+09
## BsmtUnfSF      2.612220e+10
## TotalBsmtSF    2.250196e+11
## Heating        5.407842e+08
## HeatingQC      8.979582e+09
## CentralAir     6.473470e+09
## Electrical     9.518527e+08
## X1stFlrSF      1.433146e+11
## X2ndFlrSF      1.869299e+11
## LowQualFinSF   2.280316e+09
## GrLivArea      5.995296e+11
## BsmtFullBath   7.381454e+09
## BsmtHalfBath   1.539058e+09
## FullBath       5.269536e+10
## HalfBath       7.935952e+09
## BedroomAbvGr   1.464085e+10
## KitchenAbvGr   3.712873e+09
## KitchenQual    7.717333e+10
## TotRmsAbvGrd   2.922406e+10
## Functional     1.506942e+09
## Fireplaces     3.810044e+10
## FireplaceQu    4.382973e+10
## GarageType     1.442236e+10
## GarageYrBlt    7.914855e+10
## GarageFinish   1.706542e+10
## GarageCars     5.003178e+11
## GarageArea     2.098376e+11
## GarageQual     4.029363e+09
## GarageCond     6.794172e+09
## PavedDrive     1.911447e+09
## WoodDeckSF     2.073760e+10
## OpenPorchSF    3.419679e+10
## EnclosedPorch  4.193206e+09
## X3SsnPorch     1.239280e+09
## ScreenPorch    4.338597e+09
## PoolArea       4.139390e+10
## MiscVal        7.484207e+08
## MoSold         1.420899e+10
## YrSold         7.766156e+09
## SaleType       3.617993e+09
## SaleCondition  1.843741e+10
# Score the held-out split with the random forest fitted earlier in the document.
prediction.rf <- predict(rf, newdata = testing_split)

# The competition metric is RMSE of log(SalePrice), so compare predictions
# and actuals on the log scale.
rmse.rf <- data.frame(test = log(testing_split$SalePrice), rf = log(prediction.rf) )
rmse(rmse.rf$test,rmse.rf$rf)
## [1] 0.1522375

Very good prediction here!

XGboost

One of the most powerful algorithms available. Let us try it out on this dataset!

Data prep

For the data to be used in xgboost, we must ensure that it is in sparse matrix form.

# xgboost needs a numeric matrix; sparse.model.matrix() one-hot encodes the
# factor columns. Columns 2:77 drop the Id column; SalePrice is the LHS of
# the formula, so it is automatically excluded from the predictors, and the
# `-1` removes the intercept column.
training_split.sparse <- sparse.model.matrix(SalePrice ~. -1  , data = training_split[2:77]) # drop Id; SalePrice excluded via the formula LHS
d_train <- xgb.DMatrix(data = training_split.sparse, label = training_split$SalePrice )


# Encode the validation split the same way so the design matrices line up.
testing_split.sparse <- sparse.model.matrix(SalePrice~. -1, data = testing_split[2:77])
d_valid <- xgb.DMatrix(data = testing_split.sparse, label = testing_split$SalePrice)

Parameter tuning

Using cross validation, we can obtain the optimal nrounds given the parameters that we have fixed. We set the early stopping round to 50, meaning that if there is no improvement for 50 consecutive boosting rounds, training will stop. The best iteration found is the nrounds value we will use.

# Fixed hyper-parameters for cross-validation; only nrounds is tuned by CV.
param.cv <- list(
  seed = random,
  objective="reg:linear",   # NOTE(review): deprecated alias in newer xgboost releases -- "reg:squarederror" is the current name; verify installed version
  booster="gbtree",
  eta=0.1,                  # learning rate (shrinkage)
  max.depth=8,              # maximum tree depth
  subsample=0.8,            # row subsampling per boosting round
  colsample_bytree = 0.7,   # column subsampling per tree
  num_parallel_tree = 1,
  min_child_weight = 1
)


# 5-fold CV with early stopping: stop once test RMSE has not improved for
# 50 consecutive rounds; the best iteration gives the nrounds used below.
checking <- xgb.cv(data = d_train, params = param.cv, early_stopping_rounds = 50,
                   nrounds = 1000, verbose = 1, nfold = 5, metrics = "rmse", print_every_n = 50)
## [1]  train-rmse:177101.565625+887.275937 test-rmse:177134.181250+4174.168977 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [51] train-rmse:5894.306250+302.332955   test-rmse:30372.836719+4422.231596 
## [101]    train-rmse:1821.870605+195.657007   test-rmse:29936.927734+4490.337128 
## [151]    train-rmse:595.709705+116.521429    test-rmse:29903.619922+4506.269363 
## [201]    train-rmse:182.917391+42.345203 test-rmse:29890.121094+4515.339244 
## [251]    train-rmse:52.433025+9.974365   test-rmse:29885.723437+4519.550660 
## [301]    train-rmse:17.053361+4.297775   test-rmse:29885.227734+4519.455235 
## [351]    train-rmse:5.552435+1.476018    test-rmse:29885.065235+4519.469763 
## Stopping. Best iteration:
## [319]    train-rmse:11.283068+2.784754   test-rmse:29885.009766+4519.460684

Running the xgboost model

# Same hyper-parameters as the CV run, plus an explicit eval metric for the
# watchlist logging during training.
param <- list(
  seed = random,
  objective="reg:linear",
  booster="gbtree",
  eta=0.1, 
  max.depth=8, 
  subsample=0.8,
  colsample_bytree = 0.7,
  num_parallel_tree = 1,
  min_child_weight = 1,
  eval_metric = "rmse"
)

# Track RMSE on both splits while training; nrounds = 319 is the best
# iteration found by xgb.cv above.
watchlist = list(train = d_train, test = d_valid)
xgmodel <- xgb.train(params = param, data = d_train, nrounds = 319, watchlist = watchlist, verbose = 1, print_every_n = 50)
## [1]  train-rmse:177116.031250    test-rmse:182382.984375 
## [51] train-rmse:6015.793457  test-rmse:28845.970703 
## [101]    train-rmse:2010.959961  test-rmse:27883.177734 
## [151]    train-rmse:714.633423   test-rmse:27845.777344 
## [201]    train-rmse:247.084396   test-rmse:27843.916016 
## [251]    train-rmse:82.004417    test-rmse:27840.111328 
## [301]    train-rmse:28.694693    test-rmse:27840.476562 
## [319]    train-rmse:20.059902    test-rmse:27840.406250
# Score the validation split (no label is needed for prediction) and report
# RMSE on the log scale, matching the competition metric.
d_test <- xgb.DMatrix(data = testing_split.sparse)
xgb.predict <- predict(xgmodel, d_test)
rmse(log(testing_split$SalePrice),log(xgb.predict))
## [1] 0.1367712

Feature importance

Usually, after fitting an xgboost model, it is good to take a look at the importance scores for the features:

# Gain/Cover/Frequency per feature of the one-hot-encoded design matrix.
# NOTE(review): the Feature column holds column indices of the sparse model
# matrix, not of `train` -- factor levels were expanded into dummy columns,
# so these indices do not map one-to-one onto the original data frame.
feature.importance <- xgb.importance(model = xgmodel)
head(feature.importance,10)
##     Feature       Gain       Cover   Frequency
##  1:      15 0.33531155 0.022654031 0.016126392
##  2:      44 0.16093956 0.053874109 0.030983955
##  3:      59 0.11719755 0.005054773 0.003233464
##  4:      25 0.08666229 0.002822118 0.002742305
##  5:      42 0.03418820 0.062727062 0.025171906
##  6:      32 0.03386903 0.050876949 0.042689915
##  7:      28 0.03135810 0.004794528 0.004256713
##  8:      36 0.02014810 0.039310844 0.031270465
##  9:      41 0.01986155 0.032183388 0.024926326
## 10:      18 0.01772089 0.026311366 0.027095612
xgb.plot.importance(importance_matrix = feature.importance)

# NOTE(review): indexing `train` by a sparse-matrix feature id is only
# approximate -- one-hot expansion shifts column positions. Verify the
# mapping via colnames(training_split.sparse) instead.
colnames(train)[15]
## [1] "BldgType"

So feature 15 is deemed very important by xgboost. Note, however, that the importance indices refer to columns of the one-hot-encoded sparse matrix, so mapping them directly onto columns of `train` is only approximate.

Ensemble of model

A simple ensemble: averaging several models usually beats any single one.
(Perhaps in the future, I can compare all the different combinations.)

# Simple unweighted average of the random forest, lasso and xgboost
# predictions, evaluated on the log scale.
rmse.ensemble <- data.frame(test = log(testing_split$SalePrice), prediction = log(( prediction.rf + prediction.lasso$fit + xgb.predict) / 3))
rmse(rmse.ensemble$test, rmse.ensemble$prediction)
## [1] 0.1340795

SVM

# nu-regression SVM; gamma and cost were set by hand rather than tuned.
model.svm <- svm(SalePrice ~. , training_split, type = "nu-regression", gamma = 0.01, cost = 3)

prediction.svm <- predict(model.svm, testing_split)

# Log-scale RMSE on the held-out split, as for the other models.
rmse(log(testing_split$SalePrice), log(prediction.svm))
## [1] 0.1724682

Submission

Let us retrain all the models on the entire training set, and then predict on the test data set. (I did not use the linear regression in the end, as it lowered the overall score.)

# Refit the linear model (variables selected earlier in the document) on the
# full training set. Its predictions are computed below but are NOT included
# in the final ensemble.
reg.model.full <- lm(formula = SalePrice ~ OverallQual + GrLivArea+YearBuilt + KitchenQual + BsmtFullBath + MSSubClass + GarageCars + LotArea + ExterQual + OverallCond + PoolArea + WoodDeckSF + Exterior1st + BsmtQual + BsmtCond + Fireplaces + MasVnrType + MasVnrArea + 
    YrSold + ScreenPorch + Street + SaleCondition + GarageFinish + 
    BsmtFinType2 + BsmtFinType1 + BsmtFinSF2 + TotRmsAbvGrd + 
    LandSlope + EnclosedPorch + Functional + BedroomAbvGr + Neighborhood + 
    KitchenAbvGr + X1stFlrSF + FullBath + Exterior2nd + BsmtExposure + 
    Condition1, data = train)


prediction.reg.full <- predict(reg.model.full,test)
# Build plain matrices for lars: columns 1:76 as predictors, column 77
# (SalePrice) as the outcome.
# NOTE(review): column 1 is Id, which the split models excluded -- including
# it as a covariate here looks unintentional; verify. Also, as.matrix() on a
# data frame with factor columns coerces everything to character, so this
# only works if train/test were made fully numeric during cleaning -- confirm
# against the cleaning section.

train.covariates <- as.matrix(train[,1:76])
train.outcome <- as.matrix(train[,77])
test.matrix <- as.matrix(test[,1:76])

# Lasso path over the full training set.
lasso.full <- lars(train.covariates, train.outcome, type = "lasso")

# obtain the step that minimizes Mallows' Cp
best.s <- lasso.full$df[which.min(lasso.full$Cp)]

# predict on the test set at the chosen step
prediction.lasso.full <- predict.lars(lasso.full, newx = test.matrix, s = best.s, type = "fit")
# Random forest refit on the full training set.
rf.full <- randomForest(SalePrice ~., data = train)
prediction.rf.full <- predict(rf.full, test)
# Encode the full train and test sets for xgboost.
training.sparse <- sparse.model.matrix(SalePrice ~. -1, data = train[2:77])
training <- xgb.DMatrix(training.sparse, label = train[,"SalePrice"])

# NOTE(review): the test set is encoded with data.matrix() while training
# used sparse.model.matrix(); factor handling differs between the two, so
# the column layouts may not line up -- verify, or encode the test set with
# sparse.model.matrix() as well.
testing <- xgb.DMatrix(data.matrix(test[,2:76]))


# Retrain with the CV-selected nrounds; only train RMSE is logged since
# there is no held-out set at this stage.
xgbmodel.full <- xgb.train(data = training, params = param, nrounds = 319, watchlist = list(train = training), verbose = 1, print_every_n = 50)
## [1]  train-rmse:178836.671875 
## [51] train-rmse:6537.377930 
## [101]    train-rmse:2892.638916 
## [151]    train-rmse:1353.408569 
## [201]    train-rmse:640.623413 
## [251]    train-rmse:313.330017 
## [301]    train-rmse:156.633575 
## [319]    train-rmse:129.403961
xgb.predict.full <- predict(xgbmodel.full, testing)
# SVM refit on the full data (fitted here but, like the linear model, not
# included in the ensemble below).
model.svm.full <- svm(SalePrice ~. , train, type = "nu-regression", gamma = 0.01, cost =3)
prediction.svm.full <- predict(model.svm.full, test)
# Final prediction: unweighted mean of xgboost, random forest and lasso.
ensemble.price <- (xgb.predict.full + prediction.rf.full +  prediction.lasso.full$fit) / 3
submission <- data.frame(Id = test$Id, SalePrice = ensemble.price)
write.csv(submission,'submission.csv', row.names = FALSE)